#############################Data Development#############################

#Format Tweets & Merge with Metadata
#Create Panel Datasets 

#Input: tweets, metadata, coded data
#Output: aggregated panel data for analysis 

#Note: The raw Twitter data cannot be released under the researcher agreement,
#so any call that reads from the 'data' subfolder will fail without access to it.
#This is the script that was used to create the panel datasets used in the analysis.
###########################################################################
library(pacman)
pacman::p_load(tidyverse, dplyr, lubridate, zoo, ggplot2,
               wordVectors, tsne, stm, tm, topicmodels,
               corpus, quanteda, stringr, stringi, cld3,
               urltools, longurl, plm, sandwich, lmtest,
               broom, this.path)

# Raw tweets are read relative to the project's Dropbox folder.
setwd("~/Dropbox/Venezuela_Opposition/")

# The tweet archive comes as two CSV exports; read both and stack them.
tweets <- read_csv("data/venezuela_exile_2013_2017.csv")
#tweets$long_object.body<-NA
tweets2 <- read_csv("data/venezuela_exile_2017_2020.csv")
tweets <- rbind(tweets, tweets2)

# From here on, relative paths resolve against the folder holding this script.
setwd(dirname(this.path()))

# Account-level metadata; date_of_exile is coerced to Date.
metadata <- read_csv("ReplicationData/metadata.csv")
metadata$date_of_exile <- as.Date(metadata$date_of_exile)

# Keep only tweets whose body is not missing.
has_body <- !is.na(tweets$body)
tweets <- tweets[has_body, ]

# Calendar date of each tweet, derived from object.postedTime.
tweets$date <- as.Date(
  tweets$object.postedTime,
  format = "%a %b %d %H:%M:%S %z %Y"
)

# Attach metadata by actor.id; only tweets with a metadata match are kept
# (covers both primary and secondary twitter handles).
data <- tweets %>%
  inner_join(metadata, by = "actor.id") %>%
  data.frame()

############################
#####Language detection######
############################

# Build text_l, the untruncated tweet text (not cut off at 140 characters):
#   * body when long_object.body is missing,
#   * long_object.body when it is present,
#   * and, when object.long_object.body is present, the part of body before the
#     first ":" followed by ": " and object.long_object.body.
has_long_body <- !is.na(data$long_object.body)
data$text_l <- NA
data$text_l[!has_long_body] <- data$body[!has_long_body]
data$text_l[has_long_body] <- data$long_object.body[has_long_body]

has_long_object <- !is.na(data$object.long_object.body)
body_prefix <- sapply(strsplit(data$body[has_long_object], ":"), `[`, 1)
data$text_l[has_long_object] <- paste0(
  body_prefix, ": ", data$object.long_object.body[has_long_object]
)

# Language detection with Google's CLD (cld3 package).

# Strip @mentions, '#' characters and links, then lower-case.
without_mentions <- gsub("@\\w+ *", "", data$text_l)
without_hashes <- gsub("#", "", without_mentions)
data$clean <- tolower(gsub("http\\S*", "", without_hashes))

data$language_cld <- detect_language(data$clean)

# Collapse to two labels: 'english' when cld3 returns 'en'; everything else,
# including undetected (NA) text, is labelled 'spanish'.
is_english <- !is.na(data$language_cld) & data$language_cld == "en"
data$language_simp <- ifelse(is_english, "english", "spanish")

##################################
######## Word2Vec Dictionaries####
##################################
#remotes::install_github("bmschmidt/wordVectors")

# Tokenise, drop stopwords and stem the cleaned tweets separately per language,
# then store the results as plain strings:
#   text_nstops  - tokens without stopwords, space-separated
#   text_stemmed - the same tokens after stemming, space-separated
# The intermediate token columns are dropped before the subsets are re-stacked.

# English tweets. (The original tokenised this subset twice, with the first
# result immediately overwritten; the redundant pass is removed.)
english_data <- subset(data, language_simp == "english")
english_data$text_tokens <- tokens_remove(
  tokens(english_data$clean, remove_punct = TRUE, remove_symbols = TRUE, remove_url = TRUE),
  stopwords("english")
)
english_data$tokens_stemmed <- tokens_wordstem(english_data$text_tokens, language = "english")
# vapply() fixes the return type to character even if the subset is empty.
english_data$text_nstops <- vapply(
  english_data$text_tokens, function(x) paste(x, collapse = " "), character(1)
)
english_data$text_stemmed <- vapply(
  english_data$tokens_stemmed, function(x) paste(x, collapse = " "), character(1)
)
english_data$text_tokens <- NULL
english_data$tokens_stemmed <- NULL

# Spanish tweets (everything not labelled English).
spanish_data <- subset(data, language_simp == "spanish")
spanish_data$text_tokens <- tokens_remove(
  tokens(spanish_data$clean, remove_punct = TRUE, remove_symbols = TRUE, remove_url = TRUE),
  stopwords("spanish")
)
# Remove bad/empty tokens: any token matching a Unicode separator character.
spanish_data$text_tokens <- tokens_remove(spanish_data$text_tokens, "\\p{Z}", valuetype = "regex")
spanish_data$tokens_stemmed <- tokens_wordstem(spanish_data$text_tokens, language = "spanish")
spanish_data$text_nstops <- vapply(
  spanish_data$text_tokens, function(x) paste(x, collapse = " "), character(1)
)
spanish_data$text_stemmed <- vapply(
  spanish_data$tokens_stemmed, function(x) paste(x, collapse = " "), character(1)
)
spanish_data$text_tokens <- NULL
spanish_data$tokens_stemmed <- NULL

# Re-stack: Spanish rows first, then English.
data <- rbind(spanish_data, english_data)

# Lower-case and transliterate to ASCII (strips accents) so dictionary terms match.
data$text_stemmed <- stri_trans_general(str = tolower(data$text_stemmed), id = "Latin-ASCII")
data$text_nstops <- stri_trans_general(str = tolower(data$text_nstops), id = "Latin-ASCII")

#Word2Vec embeddings to expand dictionaries 
#text<-as.character(data$text_nstops)
#write(text, "data/word2vec/tweet_text.txt")
#prep_word2vec(origin="data/word2vec/tweet_text.txt",destination="data/word2vec/tweet_prep.txt",lowercase=T,bundle_ngrams=2)
#clean nonutf8 characters in terminal
#iconv -f utf-8 -t utf-8 -c tweet_prep.txt > tweet_prep_clean.txt
#model = train_word2vec("data/word2vec/tweet_prep.txt","data/word2vec/tweet_vectors_June2020.bin",vectors=100,threads=4,window=12,iter=5,negative_samples=0, force=TRUE)

#model=read.binary.vectors("data/word2vec/tweet_vectors_June2020.bin")

###########################################
################Foreign military terms
#military_word2vec<-model %>% 
#  closest_to(model[[as.character('intervencion_militar')]],100)
#write_csv(military_word2vec, "data/word2vec/military_word2vec.csv")

#blockade_word2vec<-model %>% 
#  closest_to(model[[as.character('bloqueo_maritimo')]],100)
#write_csv(blockade_word2vec, "data/word2vec/blockade_word2vec.csv")

# military_english_word2vec<-model %>% 
#   closest_to(model[[as.character('military_intervention')]],100)
# #write_csv(military_english_word2vec, "data/word2vec/military_english_word2vec.csv")
# 
# r2p_word2vec<-model %>% 
#   closest_to(model[[as.character('r2p')]],100)
# #write_csv(r2p_word2vec, "data/word2vec/r2p_word2vec.csv")
# 

#military<-subset(read.csv('data/word2vec/military_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#blockade<-subset(read.csv('data/word2vec/blockade_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#military_english<-subset(read.csv('data/word2vec/military_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#intervention_terms<-gsub('_', ' ', unique(c(military, blockade, military_english)))

#write.csv(intervention_terms, 'ReplicationData/intervention_terms.csv', row.names = F)

# Foreign military intervention dictionary, read as a vector from the single column named `term`.
intervention_terms <- read.csv(
  "ReplicationData/intervention_terms.csv",
  col.names = "term"
)[["term"]]


##########################################
###sanctions
# sanctions_word2vec<-model %>% 
#   closest_to(model[[as.character('sanciones',
#                                  'sancion',
#                                  'economicas')]],100)
# #write_csv(sanctions_word2vec, "data/word2vec/sanctions_word2vec.csv")
# 
# sanctions_english_word2vec<-model %>% 
#   closest_to(model[[as.character('sanctions',
#                                  'sanction',
#                                  'economic')]],100)
# #write_csv(sanctions_english_word2vec, "data/word2vec/sanctions_english_word2vec.csv")
# 
# #sanctions<-subset(read.csv('data/word2vec/sanctions_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #sanctions_english<-subset(read.csv('data/word2vec/sanctions_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# 
# #sanctions_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(sanctions, sanctions_english))))
# 
# #write.csv(sanctions_terms, 'sanctions_terms.csv', row.names = F)

# Sanctions dictionary, read as a vector from the single column named `term`.
sanctions_terms <- read.csv(
  "ReplicationData/sanctions_terms.csv",
  col.names = "term"
)[["term"]]

#########################################
##########Diplomacy
# dialogue_word2vec<-model %>% 
#   closest_to(model[[as.character('dialogo')]],100)
# #write_csv(dialogue_word2vec, "data/word2vec/dialogue_word2vec.csv")
# 
# dialogue_english_word2vec<-model %>% 
#   closest_to(model[[as.character('dialogue')]],100)
# #write_csv(dialogue_english_word2vec, "data/word2vec/dialogue_english_word2vec.csv")
# 
# pressure_word2vec<-model %>% 
#   closest_to(model[[as.character('presion_diplomatica')]],100)
# #write_csv(pressure_word2vec, "data/word2vec/pressure_word2vec.csv")
# 
# pressure_english_word2vec<-model %>% 
#   closest_to(model[[as.character('pressure')]],100)
# #write_csv(pressure_english_word2vec, "data/word2vec/pressure_english_word2vec.csv")
# 
# diplomacy_word2vec<-model %>% 
#   closest_to(model[[as.character('diplomatico')]],100)
# #write_csv(diplomacy_word2vec, "data/word2vec/diplomacy_word2vec.csv")

#dialogue<-subset(read.csv('data/word2vec/dialogue_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#dialogue_english<-subset(read.csv('data/word2vec/dialogue_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#pressure<-subset(read.csv('data/word2vec/pressure_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#pressure_english<-subset(read.csv('data/word2vec/pressure_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#diplomacy<-subset(read.csv('data/word2vec/diplomacy_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#diplomacy_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(dialogue, dialogue_english,
#                                                            pressure, pressure_english,
#                                                            diplomacy))))

#write.csv(diplomacy_terms, 'diplomacy_terms.csv', row.names = F)

# Diplomacy dictionary, read as a vector from the single column named `term`.
diplomacy_terms <- read.csv(
  "ReplicationData/diplomacy_terms.csv",
  col.names = "term"
)[["term"]]

###############################
#######referencing US
# 
# US_word2vec<-model %>% 
#   closest_to(model[[as.character('eeuu')]],100)
# #write_csv(US_word2vec, "data/word2vec/US_word2vec.csv")
# 
# miami_word2vec<-model %>% 
#   closest_to(model[[as.character('miami')]],100)
# #write_csv(miami_word2vec, "data/word2vec/miami_word2vec.csv")
# 
# eeuu_word2vec<-model %>% 
#   closest_to(model[[as.character('estados_unidos')]],100)
# #write_csv(eeuu_word2vec, "data/word2vec/eeuu_word2vec.csv")
# 
# dc_word2vec<-model %>% 
#   closest_to(model[[as.character('washington')]],100)
# #write_csv(dc_word2vec, "data/word2vec/dc_word2vec.csv")
# 
# obama_word2vec<-model %>% 
#   closest_to(model[[as.character('obama')]],100)
# #write_csv(obama_word2vec, "data/word2vec/obama_word2vec.csv")
# 
# clinton_word2vec<-model %>% 
#   closest_to(model[[as.character('clinton')]],100)
# #write_csv(clinton_word2vec, "data/word2vec/clinton_word2vec.csv")
# 
# kerry_word2vec<-model %>% 
#   closest_to(model[[as.character('john_kerry')]],100)
# #write_csv(kerry_word2vec, "data/word2vec/kerry_word2vec.csv")
# 
# trump_word2vec<-model %>% 
#   closest_to(model[[as.character('trump')]],100)
# #write_csv(trump_word2vec, "data/word2vec/trump_word2vec.csv")
# 
# bolton_word2vec<-model %>% 
#   closest_to(model[[as.character('bolton')]],100)
# #write_csv(bolton_word2vec, "data/word2vec/bolton_word2vec.csv")

#us<-subset(read.csv('data/word2vec/US_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#miami<-subset(read.csv('data/word2vec/miami_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#eeuu<-subset(read.csv('data/word2vec/eeuu_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#dc<-subset(read.csv('data/word2vec/dc_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#kerry<-subset(read.csv('data/word2vec/kerry_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#clinton<-subset(read.csv('data/word2vec/clinton_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#obama<-subset(read.csv('data/word2vec/obama_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#trump<-subset(read.csv('data/word2vec/trump_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#bolton<-subset(read.csv('data/word2vec/bolton_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#us_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(us, obama, trump, bolton,
#                                                     miami, eeuu, dc, clinton, kerry))))

#write.csv(us_terms, 'us_terms.csv', row.names = F)

# US-reference dictionary, read as a vector from the single column named `term`.
us_terms <- read.csv(
  "ReplicationData/us_terms.csv",
  col.names = "term"
)[["term"]]

##################################
######other foreign actors
# general_foreign_word2vec<-model %>% 
#   closest_to(model[[as.character('lideres_mundiales')]],100)
# #write_csv(general_foreign_word2vec, "data/word2vec/general_foreign_word2vec.csv")
# 
# europe_word2vec<-model %>% 
#   closest_to(model[[as.character('europa', 'europe')]],100)
# #write_csv(europe_word2vec, "data/word2vec/europe_word2vec.csv")
# 
# europeans_word2vec<-model %>% 
#   closest_to(model[[as.character('europeos')]],100)
# #write_csv(europeans_word2vec, "data/word2vec/europeans_word2vec.csv")
# 
# norway_word2vec<-model %>% 
#   closest_to(model[[as.character('noruega', 'norway')]],100)
# #write_csv(norway_word2vec, "data/word2vec/norway_word2vec.csv")
# 
# latam_word2vec<-model %>% 
#   closest_to(model[[as.character('latinoamericanos')]],100)
# #write_csv(latam_word2vec, "data/word2vec/latam_word2vec.csv")
# 
# panama_word2vec<-model %>% 
#   closest_to(model[[as.character('panama')]],100)
# #write_csv(panama_word2vec, "data/word2vec/panama_word2vec.csv")
# 
# colombia_word2vec<-model %>% 
#   closest_to(model[[as.character('colombia')]],100)
# #write_csv(colombia_word2vec, "data/word2vec/colombia_word2vec.csv")
# 
# 
# duque_word2vec<-model %>% 
#   closest_to(model[[as.character('duque')]],100)
# #write_csv(duque_word2vec, "data/word2vec/duque_word2vec.csv")
# #note added other regional presidents, did not include local politicians
# 
# chile_word2vec<-model %>% 
#   closest_to(model[[as.character('chile')]],100)
# #write_csv(chile_word2vec, "data/word2vec/chile_word2vec.csv")
# 
# canada_word2vec<-model %>% 
#   closest_to(model[[as.character('canada')]],100)
# #write_csv(canada_word2vec, "data/word2vec/canada_word2vec.csv")
# 
# UN_word2vec<-model %>% 
#   closest_to(model[[as.character('onu')]],100)
# #write_csv(UN_word2vec, "data/word2vec/UN_word2vec.csv")
# 

#europe<-subset(read.csv('data/word2vec/europe_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#europeans<-subset(read.csv('data/word2vec/europeans_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#norway<-subset(read.csv('data/word2vec/norway_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#latam<-subset(read.csv('data/word2vec/latam_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#panama<-subset(read.csv('data/word2vec/panama_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#duque<-subset(read.csv('data/word2vec/duque_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#colombia<-subset(read.csv('data/word2vec/colombia_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#chile<-subset(read.csv('data/word2vec/chile_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#general<-subset(read.csv('data/word2vec/general_foreign_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#canada<-subset(read.csv('data/word2vec/canada_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#un<-subset(read.csv('data/word2vec/UN_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word


#foreign_actors<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(europe, europeans, norway, latam, panama,
#                                                           duque, colombia, chile, general,
#                                                           canada, un))))
#write.csv(foreign_actors, 'foreign_actors_terms.csv', row.names=F)

# Colombia-specific and general foreign-actor dictionaries, each read as a
# vector from a single column named `term`.
colombia_specific_terms <- read.csv(
  "ReplicationData/colombia_terms.csv",
  col.names = "term"
)[["term"]]
foreign_actors <- read.csv(
  "ReplicationData/foreign_actors_terms.csv",
  col.names = "term"
)[["term"]]


##################################
######Cuba/Russia
# cuba_word2vec<-model %>% 
#   closest_to(model[[as.character('cuba')]],100)
# #write_csv(cuba_word2vec, "data/word2vec/cuba_word2vec.csv")
# 
# castro_word2vec<-model %>% 
#   closest_to(model[[as.character('castro')]],100)
# #write_csv(castro_word2vec, "data/word2vec/castro_word2vec.csv")
# 
# ruso_word2vec<-model %>% 
#   closest_to(model[[as.character('rusos')]],100)
# #write_csv(ruso_word2vec, "data/word2vec/ruso_word2vec.csv")
# 
# putin_word2vec<-model %>% 
#   closest_to(model[[as.character('putin')]],100)
# #write_csv(putin_word2vec, "data/word2vec/putin_word2vec.csv")
# 
# #cuba<-subset(read.csv('data/word2vec/cuba_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #castro<-subset(read.csv('data/word2vec/castro_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #ruso<-subset(read.csv('data/word2vec/ruso_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #putin<-subset(read.csv('data/word2vec/putin_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# 
# #cuba_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(cuba, castro, ruso, putin))))
# 
# #write.csv(cuba_terms, 'cuba_terms.csv', row.names=F)

# Cuba/Russia dictionary, read as a vector from the single column named `term`.
cuba_terms <- read.csv(
  "ReplicationData/cuba_terms.csv",
  col.names = "term"
)[["term"]]

##########################################
#####protest
# protest_word2vec<-model %>% 
#   closest_to(model[[as.character("manifestacion")]],100)
# #write_csv(protest_word2vec, "data/word2vec/protest_word2vec.csv")
# 
# mobilization_word2vec<-model %>% 
#   closest_to(model[[as.character("movilizacion")]],100)
# #write_csv(mobilization_word2vec, "data/word2vec/mobilization_word2vec.csv")
# 
# protesta_word2vec<-model %>% 
#   closest_to(model[[as.character("protesta")]],100)
# #write_csv(protesta_word2vec, "data/word2vec/protesta_word2vec.csv")
# 
# protest_englishword2vec<-model %>% 
#   closest_to(model[[as.character('protest', 'protests')]],100)
# #write_csv(protest_englishword2vec, "data/word2vec/protest_englishword2vec.csv")
# 
# #protest<-subset(read.csv('data/word2vec/protest_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #mobilization<-subset(read.csv('data/word2vec/mobilization_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #protesta<-subset(read.csv('data/word2vec/protesta_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# #protest_english<-subset(read.csv('data/word2vec/protest_englishword2vec.csv', stringsAsFactors=F), keep_terms==1)$word
# 
# #protest_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(protest, protesta, mobilization, protest_english))))
# 
# #write.csv(protest_terms, 'protest_terms.csv', row.names=F)

# Protest dictionary, read as a vector from the single column named `term`.
protest_terms <- read.csv(
  "ReplicationData/protest_terms.csv",
  col.names = "term"
)[["term"]]

#############################################
##########exile
# exile_word2vec<-model %>% 
#   closest_to(model[[as.character('exilio')]],100)
# #write_csv(exile_word2vec, "data/word2vec/exile_word2vec.csv")
# 
# exile_englishword2vec<-model %>% 
#   closest_to(model[[as.character("exile", 'exiled', 'expelled')]],100)
#write_csv(exile_englishword2vec, "data/word2vec/exile_englishword2vec.csv")

#exile<-subset(read.csv('data/word2vec/exile_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#exile_english<-subset(read.csv('data/word2vec/exile_englishword2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#exile_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(exile, exile_english, 'exiled'))))

#write.csv(exile_terms, 'exile_terms.csv', row.names=F)

# Exile dictionary, read as a vector from the single column named `term`.
exile_terms <- read.csv(
  "ReplicationData/exile_terms.csv",
  col.names = "term"
)[["term"]]


#############################################
#########political prisoners
# presos_word2vec<-model %>% 
#   closest_to(model[[as.character("preso_politico")]],100)
# #write_csv(presos_word2vec, "data/word2vec/presos_word2vec.csv")
# 
# encarcelado_word2vec<-model %>% 
#   closest_to(model[[as.character("encarcelado")]],100)
# #write_csv(encarcelado_word2vec, "data/word2vec/encarcelado_word2vec.csv")
# 
# encerrado_word2vec<-model %>% 
#   closest_to(model[[as.character("encerrado")]],100)
# #write_csv(encerrado_word2vec, "data/word2vec/encerrado_word2vec.csv")
# 
# presos_english_word2vec<-model %>% 
#   closest_to(model[[as.character("prisoners")]],100)
#write_csv(presos_english_word2vec, "data/word2vec/presos_english_word2vec.csv")

#presos<-subset(read.csv('data/word2vec/presos_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#encarcelado<-subset(read.csv('data/word2vec/encarcelado_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#encerrado<-subset(read.csv('data/word2vec/encerrado_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#presos_english<-subset(read.csv('data/word2vec/presos_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#presos_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(presos, presos_english,
#                                                         encarcelado,
#                                                         encerrado))))


#write.csv(presos_terms, 'presos_terms.csv', row.names=F)

# Political-prisoner dictionary, read as a vector from the single column named `term`.
presos_terms <- read.csv(
  "ReplicationData/presos_terms.csv",
  col.names = "term"
)[["term"]]

################################################
############other repression

# torture_word2vec<-model %>% 
#   closest_to(model[[as.character("tortura")]],100)
# #write_csv(torture_word2vec, "data/word2vec/torture_word2vec.csv")
# 
# execucion_word2vec<-model %>% 
#   closest_to(model[[as.character("ejecuciones")]],100)
# #write_csv(execucion_word2vec, "data/word2vec/execucion_word2vec.csv")
# 
# disappearances_word2vec<-desapariciones_word2vec<-model %>% 
#   closest_to(model[[as.character("desapariciones")]],100)
# #write_csv(disappearances_word2vec, "data/word2vec/disappearances_word2vec.csv")
# 
# repression_word2vec<-model %>% 
#   closest_to(model[[as.character("represion")]],100)
# #write_csv(repression_word2vec, "data/word2vec/repression_word2vec.csv")
# 
# repression_english_word2vec<-model %>% 
#   closest_to(model[[as.character('torture', 'repression')]],100)
# #write_csv(repression_english_word2vec, "data/word2vec/repression_english_word2vec.csv")
# 
# human_rights_word2vec<-model %>% 
#   closest_to(model[[as.character("derechos_humanos")]],100)
# #write_csv(human_rights_word2vec, "data/word2vec/human_rights_word2vec.csv")
# 
# human_rights_english_word2vec<-model %>% 
#   closest_to(model[[as.character("human_rights")]],100)
# #write_csv(human_rights_english_word2vec, "data/word2vec/human_rights_english_word2vec.csv")

#torture<-subset(read.csv('data/word2vec/torture_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#repression<-subset(read.csv('data/word2vec/repression_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#ejecucion<-subset(read.csv('data/word2vec/execucion_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#disappearances<-subset(read.csv('data/word2vec/disappearances_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#repression_english<-subset(read.csv('data/word2vec/repression_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#humanrights<-subset(read.csv('data/word2vec/human_rights_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#humanrights_english<-subset(read.csv('data/word2vec/human_rights_english_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#repression_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(torture, repression, repression_english,
#                                                             humanrights, humanrights_english, ejecucion, disappearances))))

#write.csv(repression_terms, 'repression_terms.csv', row.names=F)

# Other-repression dictionary, read as a vector from the single column named `term`.
repression_terms <- read.csv(
  "ReplicationData/repression_terms.csv",
  col.names = "term"
)[["term"]]


#################################################
##########criticisms of the regime
# narco_word2vec<-model %>%
#   closest_to(model[[as.character('narco')]], 100)
# #write_csv(narco_word2vec, "data/word2vec/narco_word2vec.csv")
# 
# fascist_word2vec<-model %>% 
#   closest_to(model[[as.character("fascista")]],100)
# #write_csv(fascist_word2vec, "data/word2vec/fascist_word2vec.csv")
# 
# dictator_word2vec<-model %>% 
#   closest_to(model[[as.character("dictador")]],100)
# #write_csv(dictator_word2vec, "data/word2vec/dictator_word2vec.csv")

#narco<-subset(read.csv('data/word2vec/narco_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#fascist<-subset(read.csv('data/word2vec/fascist_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#dictator<-subset(read.csv('data/word2vec/dictator_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#narco_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(narco))))
#dictator_terms<-gsub('\\.', "\\\\.",gsub('_', ' ',unique(c(fascist, dictator))))

#write.csv(narco_terms, 'narco_terms.csv', row.names=F)
#write.csv(dictator_terms, 'dictator_terms.csv', row.names=F)

# Regime-criticism dictionaries (dictator and narco), each read as a vector
# from a single column named `term`.
dictator_terms <- read.csv(
  "ReplicationData/dictator_terms.csv",
  col.names = "term"
)[["term"]]
narco_terms <- read.csv(
  "ReplicationData/narco_terms.csv",
  col.names = "term"
)[["term"]]

##################################################
#########services
# 
# hospital_word2vec<-model %>% 
#   closest_to(model[[as.character('hospital')]],100)
# #write_csv(hospital_word2vec, "data/word2vec/hospital_word2vec.csv")
# 
# agua_word2vec<-model %>% 
#   closest_to(model[[as.character('agua')]],100)
# #write_csv(agua_word2vec, "data/word2vec/agua_word2vec.csv")
# 
# luz_word2vec<-model %>% 
#   closest_to(model[[as.character(c(' luz ', 'electricidad'))]],100)
# #write_csv(luz_word2vec, "data/word2vec/luz_word2vec.csv")
# 
# comida_word2vec<-model %>% 
#   closest_to(model[[as.character(c('comida'))]],100)
# #write_csv(comida_word2vec, "data/word2vec/comida_word2vec.csv")

#hospital<-subset(read.csv('data/word2vec/hospital_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#agua<-subset(read.csv('data/word2vec/agua_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#luz<-subset(read.csv('data/word2vec/luz_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word
#comida<-subset(read.csv('data/word2vec/comida_word2vec.csv', stringsAsFactors=F), keep_terms==1)$word

#services_terms<-gsub('\\.', "\\\\.", gsub('_', ' ',unique(c(hospital, agua, luz, comida))))

#write.csv(services_terms, 'services_terms.csv', row.names=F)

# Services dictionary, read as a vector from the single column named `term`.
services_terms <- read.csv(
  "ReplicationData/services_terms.csv",
  col.names = "term"
)[["term"]]


########Dictionary analysis data

# Exile timing: tweeted_exile is 1 when the tweet's posting date is on or after
# the account's date_of_exile and 0 otherwise (including when the comparison is NA).
tweet_day <- date(data$object.postedTime)
posted_in_exile <- tweet_day >= data$date_of_exile
data$tweeted_exile <- ifelse(is.na(posted_in_exile), 0, as.numeric(posted_in_exile))

# Signed number of days between the exile date and the tweet.
data$days_since_exile <- as.numeric(tweet_day - data$date_of_exile)

# Dictionary matching: a tweet is flagged when its stopword-free text matches
# any term of a dictionary (terms are joined with "|" into one regex).
tweet_text <- tolower(data$text_nstops)
has_any_term <- function(terms) {
  str_detect(tweet_text, paste(terms, collapse = "|"))
}

#foreign actors
data$us_terms <- has_any_term(us_terms)
# Foreign-actor matches are dropped when the tweet also matches the
# ' copa |futbol|soccer' pattern.
mentions_sport <- str_detect(tweet_text, c(" copa |futbol|soccer"))
data$foreign_actors_terms <- has_any_term(foreign_actors) & !mentions_sport
data$colombia_specific_terms <- has_any_term(colombia_specific_terms)

data$all_foreign_actors <- I((data$us_terms + data$foreign_actors_terms) > 0)

#foreign policies
data$military_terms <- has_any_term(intervention_terms)
data$sanctions_terms <- has_any_term(sanctions_terms)
data$diplomacy_terms <- has_any_term(diplomacy_terms)

data$aggressive_foreign_policy <- I(
  (data$military_terms + data$sanctions_terms) > 0
)
data$all_foreign_policy <- I(
  (data$military_terms + data$sanctions_terms + data$diplomacy_terms) > 0
)

#protest
data$protest_terms <- has_any_term(protest_terms)

#repression
data$exile_terms <- has_any_term(exile_terms)
data$presos_terms <- has_any_term(presos_terms)
data$repression_terms <- has_any_term(repression_terms)

data$all_repression <- I((data$presos_terms + data$repression_terms) > 0)

#criticism
data$narco_terms <- has_any_term(narco_terms)
data$dictator_terms <- has_any_term(dictator_terms)
data$cuba_terms <- has_any_term(cuba_terms)

data$harsh_criticism_terms <- I(
  (data$narco_terms + data$dictator_terms + data$cuba_terms + data$all_repression) > 0
)

#services
data$service_terms <- has_any_term(services_terms)


################################
######Classified tweets
# Hand-classified tweets (military intervention and sanctions) are read from the
# restricted project folder and attached to `data`.
setwd("~/Dropbox/Venezuela_Opposition/")

military_classified_tweets <- read.csv("data/mil_int_tweets_classified.csv")
sanctions_classified_tweets <- read.csv("data/sanctions_tweets_classified.csv")

# Parse timestamps as POSIXct rather than POSIXlt: POSIXlt is a list-based class
# that is fragile as a data-frame column and merge key.
military_classified_tweets$postedTime <- as.POSIXct(
  military_classified_tweets$postedTime, format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"
)
sanctions_classified_tweets$postedTime <- as.POSIXct(
  sanctions_classified_tweets$postedTime, format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"
)

# Keep the merge keys and the three coded labels, suffixing the labels by topic.
military_classified_tweets <- military_classified_tweets[c("postedTime", "actor.id", "text_nstops",
                                                           "relevant", "positive", "negative")]
colnames(military_classified_tweets)[4:6] <- c("relevant_military", "positive_military", "negative_military")

# Left join: every row of `data` is kept; unmatched rows get NA labels.
data <- merge(data, military_classified_tweets, by = c("postedTime", "actor.id", "text_nstops"), all.x = TRUE)

sanctions_classified_tweets <- sanctions_classified_tweets[c("postedTime", "actor.id", "text_nstops",
                                                             "relevant", "positive", "negative")]
colnames(sanctions_classified_tweets)[4:6] <- c("relevant_sanctions", "positive_sanctions", "negative_sanctions")

data <- merge(data, sanctions_classified_tweets, by = c("postedTime", "actor.id", "text_nstops"), all.x = TRUE)

# Missing labels (tweets without a matching classified row) are set to 0.
for (label_col in c("positive_military", "negative_military",
                    "positive_sanctions", "negative_sanctions",
                    "relevant_military", "relevant_sanctions")) {
  data[[label_col]][is.na(data[[label_col]])] <- 0
}


######################################################
############Foreign v Venezuela engagement

# Tweets coded as engaging Venezuelan vs. foreign accounts, read from two CSVs.
ven_engagement <- read.csv("data/engagement_tweets/ven_engagement_tweets.csv", stringsAsFactors = FALSE)
foreign_engagement <- read.csv("data/engagement_tweets/foreign_engagement_tweets.csv", stringsAsFactors = FALSE)

# Parse timestamps as POSIXct rather than POSIXlt: these columns are merge keys
# and are later passed through unique() and dplyr, where list-based POSIXlt
# columns are fragile.
ven_engagement$postedTime <- as.POSIXct(ven_engagement$postedTime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
foreign_engagement$postedTime <- as.POSIXct(foreign_engagement$postedTime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")

# Indicator columns marking which file each row came from.
ven_engagement$ven_at <- 1
foreign_engagement$foreign_at <- 1

# Drop exact duplicate rows before merging.
ven_engagement <- unique(ven_engagement)
foreign_engagement <- unique(foreign_engagement)

engagement_keys <- c("text_nstops", "postedTime", "actor.id", "actor.preferredUsername")

# Full outer join: a tweet may appear in either file or in both.
foreign_data <- merge(ven_engagement, foreign_engagement, by = engagement_keys, all = TRUE)

# Inner join onto the main tweet data; a missing indicator means the tweet was
# absent from that engagement file, so it is set to 0.
engagement_data <- merge(foreign_data, data, by = engagement_keys)
engagement_data$ven_at[is.na(engagement_data$ven_at)] <- 0
engagement_data$foreign_at[is.na(engagement_data$foreign_at)] <- 0

# Aggregate the engagement tweets by account (actor.id) and calendar month.
# Account-level attributes are carried through with unique(); the share of tweets
# flagged foreign_at is expressed in percent. Months before 2013 are dropped.
engagement_panel <- engagement_data %>%
  group_by(actor.id, month = floor_date(date, "1 month")) %>%
  dplyr::summarize(
    num_tweets = n(),
    tweeted_exile = min(as.numeric(month >= date_of_exile)),
    date_of_exile = unique(date_of_exile),
    forced = unique(forced),
    type = unique(type),
    exile = unique(exile),
    destination = unique(destination),
    followers = mean(actor.followersCount),
    retweets = mean(retweetCount),
    perc_foreign_engagement = sum(foreign_at) / n() * 100
  ) %>%
  filter(month >= ymd("2013-01-01")) %>%
  data.frame()


# A missing exile indicator is treated as "not (yet) in exile".
no_exile_flag <- is.na(engagement_panel$tweeted_exile)
engagement_panel$tweeted_exile[no_exile_flag] <- 0
engagement_panel[["year"]] <- year(engagement_panel[["month"]])

# Whole months elapsed from the exile date to the panel month.
exile_to_month <- interval(engagement_panel$date_of_exile, engagement_panel$month)
engagement_panel$months_since <- exile_to_month %/% months(1)

# `forced` gets the label 'no_exile' where it is missing and exile == 'no';
# two factor versions are kept that differ only in which level comes first.
never_exiled <- is.na(engagement_panel$forced) & engagement_panel$exile == "no"
engagement_panel$forced[never_exiled] <- "no_exile"
engagement_panel$forced <- factor(
  engagement_panel$forced,
  levels = c("no_exile", "no", "yes")
)
engagement_panel$forced_baseno <- factor(
  engagement_panel$forced,
  levels = c("no", "no_exile", "yes")
)

# Time trend: a copy of the panel month.
engagement_panel[["trend"]] <- engagement_panel[["month"]]

#defining leads and lags
engagement_panel$lead_lags<-NA
engagement_panel$lead_lags[is.na(engagement_panel$months_since)==F&engagement_panel$months_since==-1]<-'month_before'
engagement_panel$lead_lags[is.na(engagement_panel$months_since)==F&engagement_panel$months_since< -6]<-'pre'
engagement_panel$lead_lags[is.na(engagement_panel$months_since)==F&engagement_panel$months_since>12]<-'post'
engagement_panel$lead_lags[is.na(engagement_panel$months_since)==F&engagement_panel$months_since>=-6&engagement_panel$months_since< (-1)]<-engagement_panel$months_since[is.na(engagement_panel$months_since)==F&engagement_panel$months_since>=-6&engagement_panel$months_since< (-1)]
engagement_panel$lead_lags[is.na(engagement_panel$months_since)==F&engagement_panel$months_since<12&engagement_panel$months_since>=0]<-engagement_panel$months_since[is.na(engagement_panel$months_since)==F&engagement_panel$months_since<12&engagement_panel$months_since>=0]
engagement_panel$lead_lags[is.na(engagement_panel$months_since)]<-'missing'

engagement_panel$lead_lags<-factor(engagement_panel$lead_lags, levels=c('month_before', 'missing', 'pre', '-6',
                                                                        '-5', '-4', '-3', '-2', '0', 
                                                                        '1', '2', '3', '4', '5', '6',
                                                                        '7', '8', '9', '10', '11', 'post'))


#whether exile ended up in US
engagement_panel$US_destination<-ifelse(engagement_panel$destination=='USA', 'us', 'other')
engagement_panel$US_destination[is.na(engagement_panel$US_destination)]<-'not_exiled'
engagement_panel$US_destination<-factor(engagement_panel$US_destination, levels=c('other', 'not_exiled', 'us'))

#whether exile ended up in Colombia
engagement_panel$CO_destination<-ifelse(engagement_panel$destination=='Colombia', 'colombia', 'other')
engagement_panel$CO_destination[is.na(engagement_panel$CO_destination)]<-'not_exiled'
engagement_panel$CO_destination<-factor(engagement_panel$CO_destination, levels=c('other', 'not_exiled', 'colombia'))

setwd(dirname(this.path()))
#write.csv(engagement_panel, 'ReplicationData/engagement_panel.csv', row.names=F)

#####################################
#######PANEL DATA####################
#####################################
# First day of the month of exile and of the start/end of imprisonment
# (prison dates are parsed with the '%m/%d/%y' format)
data$month_of_exile <- floor_date(data$date_of_exile, "1 month")
data$month_prison_start <- floor_date(as.Date(data$prison_start, "%m/%d/%y"), "1 month")
data$month_prison_end <- floor_date(as.Date(data$prison_end, "%m/%d/%y"), "1 month")


# Collapse the tweet-level data to one row per account and calendar month,
# keeping months from January 2013 onwards. num_* columns are monthly counts
# of the tweet-level indicators; perc_* columns are the same counts as a
# percentage of the account's tweets in that month.
data_panel <- data %>%
  group_by(actor.id, month = floor_date(date, "1 month")) %>%
  dplyr::summarize(
    num_tweets = n(),
    # 1 from the month of exile onwards, 0 before, NA without an exile month
    tweeted_exile = min(ifelse(month >= month_of_exile, 1, 0)),
    date_of_exile = unique(date_of_exile),
    month_of_exile = unique(month_of_exile),
    # 1 when the month lies between the prison start and end months
    tweeted_prison = min(ifelse(month >= month_prison_start & month <= month_prison_end, 1, 0)),
    # 1 when a prison start is recorded and the prison end is after 2013-01-01
    prison = min(ifelse(!is.na(month_prison_start) & month_prison_end > "2013-01-01", 1, 0)),
    # account-level metadata carried through unchanged
    forced = unique(forced),
    party = unique(party),
    reason = unique(reason),
    type = unique(type),
    exile = unique(exile),
    year_elected = unique(year_elected_latest),
    destination = unique(destination),
    # monthly averages of follower and retweet counts
    followers = mean(actor.followersCount),
    retweets = mean(retweetCount),
    # monthly counts
    num_military = sum(military_terms),
    num_sanctions = sum(sanctions_terms),
    num_diplomacy = sum(diplomacy_terms),
    num_foreign_policy = sum(all_foreign_policy),
    num_service = sum(service_terms),
    num_us = sum(us_terms),
    num_foreign = sum(foreign_actors_terms),
    num_cuba = sum(cuba_terms),
    num_all_foreign = sum(all_foreign_actors),
    num_protest = sum(protest_terms),
    num_exile = sum(exile_terms),
    num_presos = sum(presos_terms),
    num_repression = sum(repression_terms),
    num_all_repression = sum(all_repression),
    num_harsh_criticism = sum(harsh_criticism_terms),
    num_relevant_military = sum(relevant_military),
    num_relevant_sanctions = sum(relevant_sanctions),
    num_positive_military = sum(positive_military),
    num_positive_sanctions = sum(positive_sanctions),
    # monthly percentages
    perc_military = sum(military_terms) / n() * 100,
    perc_sanctions = sum(sanctions_terms) / n() * 100,
    perc_diplomacy = sum(diplomacy_terms) / n() * 100,
    perc_aggressive_fp = sum(aggressive_foreign_policy) / n() * 100,
    perc_foreign_policy = sum(all_foreign_policy) / n() * 100,
    perc_us = sum(us_terms) / n() * 100,
    perc_foreign = sum(foreign_actors_terms) / n() * 100,
    perc_cuba = sum(cuba_terms) / n() * 100,
    perc_all_foreign = sum(all_foreign_actors) / n() * 100,
    perc_protest = sum(protest_terms) / n() * 100,
    perc_exile = sum(exile_terms) / n() * 100,
    perc_presos = sum(presos_terms) / n() * 100,
    perc_repression = sum(repression_terms) / n() * 100,
    perc_all_repression = sum(all_repression) / n() * 100,
    perc_harsh_criticism = sum(harsh_criticism_terms) / n() * 100,
    perc_narco = sum(narco_terms) / n() * 100,
    perc_dictator = sum(dictator_terms) / n() * 100,
    perc_service = sum(service_terms) / n() * 100,
    perc_colombia = sum(colombia_specific_terms) / n() * 100,
    # share of tweets whose language_simp is 'english'
    perc_english = sum(I(language_simp == "english")) / n() * 100,
    perc_relevant_military = sum(relevant_military) / n() * 100,
    perc_relevant_sanctions = sum(relevant_sanctions) / n() * 100,
    perc_positive_military = sum(positive_military) / n() * 100,
    perc_negative_military = sum(negative_military) / n() * 100,
    perc_positive_sanctions = sum(positive_sanctions) / n() * 100,
    perc_negative_sanctions = sum(negative_sanctions) / n() * 100
  ) %>%
  filter(month >= ymd("2013-01-01")) %>%
  data.frame()

# Months with no prison dates yield NA above; code them as 0
data_panel$tweeted_prison[is.na(data_panel$tweeted_prison)] <- 0

## for the two accounts that returned to Venezuela, set the exile indicator
## back to zero from the month of return onwards
data_panel$tweeted_exile <- ifelse(data_panel$actor.id == 'id:twitter.com:71076388' & data_panel$month >= '2015-11-01', 0, data_panel$tweeted_exile)
data_panel$tweeted_exile <- ifelse(data_panel$actor.id == 'id:twitter.com:33718836' & data_panel$month >= '2020-03-01', 0, data_panel$tweeted_exile)

# save off to create a balanced panel (taken before the NA -> 0 recode below,
# so the balanced panel can fill tweeted_exile from neighbouring months)
dp_balance <- data_panel

data_panel$tweeted_exile[is.na(data_panel$tweeted_exile)] <- 0
data_panel$year <- year(data_panel$month)

# measure of months since exile (whole months between the month of exile and
# the panel month; NA when there is no exile month)
data_panel$months_since <- interval(data_panel$month_of_exile, data_panel$month) %/% months(1)

# variable for if forced: accounts with exile == 'no' and no forced value get
# their own 'no_exile' level
data_panel$forced[is.na(data_panel$forced) & data_panel$exile == 'no'] <- 'no_exile'
data_panel$forced <- factor(data_panel$forced, levels = c('no_exile', 'no', 'yes'))
# same variable with 'no' as the reference (first) level
data_panel$forced_baseno <- factor(data_panel$forced, levels = c('no', 'no_exile', 'yes'))

# time trend: the calendar year as a number. The script used to set trend to
# the month here and overwrite it with the year further down; the final value
# is now assigned once, at the same column position.
data_panel$trend <- as.numeric(as.character(data_panel$year))

# defining leads and lags
#   'month_before' : month -1 (first factor level, i.e. the reference)
#   'pre'          : more than 6 months before exile
#   '-6' ... '-2'  : individual lead months
#   '0' ... '11'   : individual lag months
#   'post'         : 12 or more months after exile
# lead_lags   : rows without an exile month are pooled into 'pre'
# lead_lags_b : rows with exile == 'no' are pooled into 'month_before'; other
#               rows without an exile month stay NA
# FIX: the 'post' bin previously used months_since > 12 while the numeric
# labels stopped at months_since < 12, so month 12 matched neither rule and
# became an NA factor level. 'post' now starts at 12.
data_panel[c('lead_lags', 'lead_lags_b')] <- local({
  ms <- data_panel$months_since
  known <- !is.na(ms)
  lead_lag_levels <- c('month_before', 'pre', '-6',
                       '-5', '-4', '-3', '-2', '0',
                       '1', '2', '3', '4', '5', '6',
                       '7', '8', '9', '10', '11', 'post')

  # labels shared by both variants; NA where months_since is missing
  base_labels <- rep(NA_character_, length(ms))
  base_labels[known & ms == -1] <- 'month_before'
  base_labels[known & ms < -6] <- 'pre'
  base_labels[known & ms >= 12] <- 'post'
  own_label <- known & ((ms >= -6 & ms < -1) | (ms >= 0 & ms < 12))
  base_labels[own_label] <- as.character(ms[own_label])

  with_pre <- base_labels
  with_pre[!known] <- 'pre'

  with_reference <- base_labels
  with_reference[data_panel$exile %in% 'no'] <- 'month_before'

  list(factor(with_pre, levels = lead_lag_levels),
       factor(with_reference, levels = lead_lag_levels))
})


# whether exile ended up in US ('not_exiled' only for accounts with exile == 'no')
data_panel$US_destination <- ifelse(data_panel$destination == 'USA', 'us', 'other')
data_panel$US_destination[is.na(data_panel$US_destination) & data_panel$exile == 'no'] <- 'not_exiled'
data_panel$US_destination <- factor(data_panel$US_destination, levels = c('other', 'not_exiled', 'us'))

# whether exile ended up in Colombia ('not_exiled' only for accounts with exile == 'no')
data_panel$CO_destination <- ifelse(data_panel$destination == 'Colombia', 'colombia', 'other')
data_panel$CO_destination[is.na(data_panel$CO_destination) & data_panel$exile == 'no'] <- 'not_exiled'
data_panel$CO_destination <- factor(data_panel$CO_destination, levels = c('other', 'not_exiled', 'colombia'))

#write.csv(data_panel, 'ReplicationData/data_panel.csv', row.names=F)


########################################
###########Balanced panel###############
########################################
# Balance the panel: add a row for every account x observed month that is
# absent, with zero tweets and zero percentages for the listed columns.
# NOTE(review): perc_election and perc_democracy are not created by the
# data_panel summarise, while perc_service, perc_colombia and the num_* term
# counts are not zero-filled here and therefore stay NA on the added rows --
# confirm this is intended.
dp_balance <- data.frame(dp_balance %>% complete(actor.id, nesting(month),
                                                 fill = list(num_tweets = 0, perc_military = 0,
                                                             perc_sanctions = 0, perc_diplomacy = 0, perc_aggressive_fp = 0,
                                                             perc_foreign_policy = 0, perc_us = 0, perc_foreign = 0, perc_cuba = 0, perc_all_foreign = 0,
                                                             perc_protest = 0, perc_exile = 0, perc_presos = 0, perc_repression = 0, perc_all_repression = 0,
                                                             perc_election = 0, perc_harsh_criticism = 0, perc_narco = 0, perc_dictator = 0,
                                                             perc_democracy = 0, perc_english = 0, perc_relevant_military = 0, perc_relevant_sanctions = 0,
                                                             perc_positive_military = 0, perc_negative_military = 0, perc_positive_sanctions = 0,
                                                             perc_negative_sanctions = 0)))

# Carry account-level values onto the added rows: within each account, fill
# from the previous observed month first and from the next one otherwise
dp_balance <- data.frame(dp_balance %>% group_by(actor.id) %>%
                           fill(date_of_exile, forced, type, year_elected, destination,
                                followers, tweeted_exile, exile, retweets, .direction = 'downup'))

# Anything still missing (no exile date at all) is coded as not in exile
dp_balance$tweeted_exile[is.na(dp_balance$tweeted_exile)] <- 0

dp_balance$year <- year(dp_balance$month)

# measure of months since exile
# NOTE(review): this uses the exact date_of_exile, whereas data_panel uses the
# floored month_of_exile -- confirm the difference is intended.
dp_balance$months_since <- interval(dp_balance$date_of_exile, dp_balance$month) %/% months(1)

# variable for if forced: accounts with exile == 'no' and no forced value get
# their own 'no_exile' level
dp_balance$forced[is.na(dp_balance$forced) & dp_balance$exile == 'no'] <- 'no_exile'
dp_balance$forced <- factor(dp_balance$forced, levels = c('no_exile', 'no', 'yes'))
# same variable with 'no' as the reference (first) level
dp_balance$forced_baseno <- factor(dp_balance$forced, levels = c('no', 'no_exile', 'yes'))

# time trend
dp_balance$trend <- dp_balance$month

# defining leads and lags
#   'month_before' : month -1 (first factor level, i.e. the reference)
#   'pre'          : more than 6 months before exile
#   '-6' ... '-2'  : individual lead months
#   '0' ... '11'   : individual lag months
#   'post'         : 12 or more months after exile
#   'missing'      : no exile date
# FIX: the 'post' bin previously used months_since > 12 while the numeric
# labels stopped at months_since < 12, so month 12 matched neither rule and
# became an NA factor level. 'post' now starts at 12.
dp_balance$lead_lags <- local({
  ms <- dp_balance$months_since
  known <- !is.na(ms)
  labels <- rep(NA_character_, length(ms))

  labels[known & ms == -1] <- 'month_before'
  labels[known & ms < -6] <- 'pre'
  labels[known & ms >= 12] <- 'post'

  # months that keep their own numeric label
  own_label <- known & ((ms >= -6 & ms < -1) | (ms >= 0 & ms < 12))
  labels[own_label] <- as.character(ms[own_label])

  labels[!known] <- 'missing'

  factor(labels, levels = c('month_before', 'missing', 'pre', '-6',
                            '-5', '-4', '-3', '-2', '0',
                            '1', '2', '3', '4', '5', '6',
                            '7', '8', '9', '10', '11', 'post'))
})

# whether exile ended up in US (a missing destination is coded 'not_exiled')
dp_balance$US_destination <- ifelse(dp_balance$destination == 'USA', 'us', 'other')
dp_balance$US_destination[is.na(dp_balance$US_destination)] <- 'not_exiled'
dp_balance$US_destination <- factor(dp_balance$US_destination, levels = c('other', 'not_exiled', 'us'))

# whether exile ended up in Colombia (a missing destination is coded 'not_exiled')
dp_balance$CO_destination <- ifelse(dp_balance$destination == 'Colombia', 'colombia', 'other')
dp_balance$CO_destination[is.na(dp_balance$CO_destination)] <- 'not_exiled'
dp_balance$CO_destination <- factor(dp_balance$CO_destination, levels = c('other', 'not_exiled', 'colombia'))
# month is stored as text from here on
dp_balance$month <- as.character(dp_balance$month)

#write.csv(dp_balance, 'ReplicationData/balanced_panel.csv', row.names=F)


##############################
######from start of tweeting
# For every account, record the first month in which it tweeted at all.
# month is a character column at this point, so min() compares the date
# strings as text.
dp_balance <- ddply(
  dp_balance, .(actor.id), transform,
  min_tweeting = min(month[num_tweets > 0])
)
dp_balance$min_tweeting <- as.character(as.Date(dp_balance$min_tweeting))

# Keep only the months from each account's first tweeting month onwards
# (rows where the comparison is NA are dropped, as subset() would do)
dp_with_zeros <- dp_balance[which(dp_balance$month >= dp_balance$min_tweeting), , drop = FALSE]

#write.csv(dp_with_zeros, 'ReplicationData/dp_with_zeros.csv', row.names=F)


## t-test analysis/data production
# function to perform pre/post t-test
#
# Compares a tweet-level measure in the window after exile with the window
# before exile, using t.test() with its default settings.
#
# Args:
#   days_since: window length in days.
#   term:       vector with one value per tweet, aligned element-by-element
#               with `days`.
#   days:       days elapsed since exile for each element of `term` (negative
#               values are before exile). Defaults to data$days_since_exile,
#               which is what the function always used; passing it explicitly
#               lets the function run on data other than the global `data`.
#
# Returns:
#   The object returned by t.test(). The first sample is the post window
#   (0 <= days < days_since), the second the pre window
#   (-days_since < days < 0).
#
# NOTE(review): with whole-day values the pre window is one day shorter than
# the post window (it excludes -days_since, while the post window includes
# day 0) -- confirm this is intended.
ttest_terms <- function(days_since, term, days = data$days_since_exile) {
  # guard against silent recycling of the window masks
  stopifnot(length(days) == length(term))

  post_window <- days >= 0 & days < days_since
  pre_window <- days < 0 & days > -days_since

  t.test(term[post_window], term[pre_window])
}


# plotting for pre/post t-test
#
# Runs ttest_terms() once per window length and stacks the results into one
# data frame, one row per window.
#
# Args:
#   periods: vector of window lengths in days (at least one).
#   term:    tweet-level measure passed on to ttest_terms().
#   type:    'percent of mean' divides the interval and estimate by
#            mean(term); any other value leaves them undivided.
#   label:   text label stored on every row.
#
# Returns:
#   A data frame with columns conf.low, conf.high, est, label, tval, p.value,
#   degfree and period. Interval bounds and the estimate (post mean minus pre
#   mean) are multiplied by 100. Because the values are bound together with
#   the text label, every column except period comes back as text (factor
#   under pre-4.0 R defaults); convert with as.numeric(as.character(x)).
#
# Changes from the previous version:
#   * a single window length now works (the old loop over 2:length(periods)
#     ran backwards and indexed past the end of `periods`);
#   * degfree is rounded to two decimals for every window, not only the first.
ttest_multiple_periods <- function(periods, term, type, label) {
  stopifnot(length(periods) >= 1)

  divisor <- if (isTRUE(type == 'percent of mean')) mean(term) else 1

  # one result row for a single window length
  summarise_period <- function(window) {
    ttest_result <- ttest_terms(window, term)
    conf.low <- ttest_result$conf.int[1] / divisor * 100
    conf.high <- ttest_result$conf.int[2] / divisor * 100
    est <- (ttest_result$estimate[1] / divisor - ttest_result$estimate[2] / divisor) * 100
    tval <- round(ttest_result$statistic, 2)
    p.value <- round(ttest_result$p.value, 2)
    degfree <- round(ttest_result$parameter, 2)
    data.frame(cbind(conf.low, conf.high, est, label, tval, p.value, degfree))
  }

  tt_df <- do.call(rbind, lapply(periods, summarise_period))
  tt_df$period <- periods
  tt_df
}

# Pre/post t-tests for each tweet-level measure over 120-, 60- and 30-day
# windows. 'prepost' is not 'percent of mean', so results are not divided by
# the mean of the measure.

# foreign-policy measures
tt_foreign_policy <- ttest_multiple_periods(
  c(120, 60, 30), data$all_foreign_policy, "prepost", "Foreign Policy"
)
tt_military <- ttest_multiple_periods(
  c(120, 60, 30), data$military_terms, "prepost", "Military Intervention"
)
tt_sanctions <- ttest_multiple_periods(
  c(120, 60, 30), data$sanctions_terms, "prepost", "Sanctions"
)
tt_diplomacy <- ttest_multiple_periods(
  c(120, 60, 30), data$diplomacy_terms, "prepost", "Diplomacy"
)

# protest
tt_protest <- ttest_multiple_periods(
  c(120, 60, 30), data$protest_terms, "prepost", "Protest"
)

# criticism and repression measures
tt_criticism <- ttest_multiple_periods(
  c(120, 60, 30), data$harsh_criticism_terms, "prepost", "Criticism"
)
tt_narco <- ttest_multiple_periods(
  c(120, 60, 30), data$narco_terms, "prepost", "Narco-State"
)
tt_dictator <- ttest_multiple_periods(
  c(120, 60, 30), data$dictator_terms, "prepost", "Dictator"
)
tt_cuba <- ttest_multiple_periods(
  c(120, 60, 30), data$cuba_terms, "prepost", "Cuban/Russian Influence"
)
tt_repression <- ttest_multiple_periods(
  c(120, 60, 30), data$all_repression, "prepost", "Repression"
)

# services
tt_services <- ttest_multiple_periods(
  c(120, 60, 30), data$service_terms, "prepost", "Services"
)


# Stack all results into one table (row order as listed)
ttest_results <- rbind(
  tt_services, tt_repression, tt_cuba, tt_dictator, tt_narco,
  tt_criticism, tt_protest, tt_diplomacy, tt_sanctions, tt_military,
  tt_foreign_policy
)

# Window length as a factor ordered 120, 60, 30
ttest_results$period <- factor(ttest_results$period, levels = c(120, 60, 30))

# The estimate and interval bounds come back as text; convert them to numbers
ttest_results[c("est", "conf.low", "conf.high")] <- lapply(
  ttest_results[c("est", "conf.low", "conf.high")],
  function(values) as.numeric(as.character(values))
)

